# Introducing Virco data
# Read the Virco data set from the working directory.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 as well
# (it was the read.csv() default before 4.0). The as.numeric() recoding and
# the na.roughfix() imputation further down need factor columns, not
# character columns.
virco <- read.csv(file = "Virco_data.csv", header = TRUE, sep = ",",
                  stringsAsFactors = TRUE)
# Later sections refer to columns such as IDV.Fold, NFV.Fold and APV.Fold by
# bare name, so the data frame stays attached to the search path.
attach(virco)
# Preview the first ten values of two of the columns.
IDV.Fold[1:10]
P71[1:10]

# Preparing Virco data
# Binary genotype table: one logical column for every column of virco whose
# name begins with "P", TRUE wherever the entry differs from "-".
# NOTE(review): presumably "-" marks "no mutation at this position" - confirm.
VircoGenoBin <- data.frame(virco[, startsWith(names(virco), "P")] != "-")
VircoGenoBin[seq_len(10), 71]
# Two-level trait: is the IDV fold value larger than the NFV fold value?
Trait <- factor(virco$IDV.Fold > virco$NFV.Fold)
Trait[seq_len(10)]

# Classification tree for Virco data
# Install rpart only when it is missing, so that re-running the script does
# not reinstall the package (or require network access) every time.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
library(rpart)
# Trait is not a column of VircoGenoBin; the formula finds it in the
# workspace, while "." stands for every column of VircoGenoBin.
ClassTree <- rpart(Trait ~ ., method = "class", data = VircoGenoBin)
ClassTree
plot(ClassTree)
text(ClassTree)
# Same model with the information splitting criterion instead of the default.
rpart(Trait ~ ., method = "class", parms = list(split = "information"),
      data = VircoGenoBin)
# Gini splitting with stricter size limits: minsplit = 150 and minbucket = 50
# are passed to rpart.control().
rpart(Trait ~ ., method = "class", parms = list(split = "gini"),
      control = rpart.control(minsplit = 150, minbucket = 50),
      data = VircoGenoBin)

# Regression tree for Virco data
# Trait is redefined here as a numeric quantity: the NFV fold value minus the
# IDV fold value. Every later use of Trait sees this numeric version.
Trait <- virco$NFV.Fold - virco$IDV.Fold
RegTree <- rpart(
  Trait ~ .,
  data = VircoGenoBin,
  method = "anova"
)
RegTree
# Draw the tree skeleton, then add the split labels on top of it.
plot(RegTree)
text(RegTree)

# Other types of predictors
# Categorical coding: keep the raw entries of every column of virco whose
# name starts with "P".
VircoGenoCat <- data.frame(virco[, substr(names(virco), 1, 1) == "P"])
# On R >= 4.0 read.csv() returns text columns as character vectors, for which
# the as.numeric() recoding below would produce nothing but NA (with
# warnings). Turn any character column into a factor; columns that are
# already factors (or numeric) are left untouched.
VircoGenoCat[] <- lapply(VircoGenoCat, function(column) {
  if (is.character(column)) factor(column) else column
})
VircoGenoCat[1:10, 71]
# Numeric coding: as.numeric() maps each factor column to its integer level
# codes. vapply() pins the result to one numeric column per predictor,
# whereas sapply() may silently change its return type.
VircoGenoOrd <- vapply(VircoGenoCat, as.numeric,
                       numeric(nrow(VircoGenoCat)))
VircoGenoOrd[1:10, 71]
# Regression tree on the categorical predictors. APV.Fold is not a column of
# VircoGenoCat; it is found through the attached virco data frame.
Tree <- rpart(APV.Fold ~ ., method = "anova", data = VircoGenoCat)
Tree

# Pruning regression tree for Virco data
# Fix the seed first: the cross-validated errors that rpart stores in the
# cptable depend on the random number stream.
set.seed(1980)
Tree <- rpart(APV.Fold ~ ., method = "anova", data = VircoGenoCat)
Tree
printcp(Tree)
plotcp(Tree)
# One-standard-error rule, step 1: take the row with the smallest
# cross-validated error and add that row's standard error to it.
cutoff <- local({
  xerror <- Tree$cptable[, "xerror"]
  xstd <- Tree$cptable[, "xstd"]
  (xerror + xstd)[which.min(xerror)]
})
cutoff
# Step 2: use the CP value of the first cptable row whose cross-validated
# error lies below that cutoff.
cpvalue <- Tree$cptable[match(TRUE, Tree$cptable[, "xerror"] < cutoff), "CP"]
cpvalue
PrunedTree <- prune(Tree, cp = cpvalue)
PrunedTree
# Draw the full tree and the pruned tree for comparison.
plot(Tree)
plot(PrunedTree)

# Random forest for Virco data
# Install randomForest only when it is missing, so that re-running the script
# does not reinstall the package (or require network access) every time.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)
# Keep only the observations whose trait is known, in both the response and
# the binary predictor table. Trait is the numeric NFV - IDV difference
# defined in the regression-tree section.
Trait.c <- Trait[!is.na(Trait)]
VircoGenoBin.c <- VircoGenoBin[!is.na(Trait), ]
# Fix the seed so the fitted forest is reproducible.
set.seed(1980)
RegRF <- randomForest(VircoGenoBin.c, Trait.c, importance = TRUE)
# Importance table with rows sorted by its first column, largest first.
RegRF$importance[order(RegRF$importance[, 1], decreasing = TRUE), ]
varImpPlot(RegRF)

# Introducing missing genotypes in Virco data
# Restrict the categorical predictors to the rows with a known trait and
# report the share of cells that are missing.
VircoGenoCat.c <- VircoGenoCat[!is.na(Trait), ]
mean(is.na(VircoGenoCat.c))
# Work on a copy, so the complete table stays available for comparison.
VircoGenoCat.m <- VircoGenoCat.c
set.seed(1980)
# Logical mask with the same dimensions as the table; each cell is drawn
# independently and is TRUE with probability 0.05.
makeNA <- array(
  sample(c(FALSE, TRUE), prod(dim(VircoGenoCat.c)),
         replace = TRUE, prob = c(0.95, 0.05)),
  dim = dim(VircoGenoCat.c)
)
# Blank out the masked cells, then report the new share of missing cells.
is.na(VircoGenoCat.m) <- makeNA
mean(is.na(VircoGenoCat.m))

# RF with missing predictors for Virco data
# Fill the blanked cells with randomForest's na.roughfix() and check that no
# missing cells remain.
VircoGenoCat.r <- na.roughfix(VircoGenoCat.m)
mean(is.na(VircoGenoCat.r))
# Share of cells where the filled-in table differs from the complete table,
# relative to the share of cells that were missing, subtracted from one.
# NOTE(review): reads as "fraction of blanked cells imputed correctly" only if
# VircoGenoCat.c itself has no missing cells - confirm.
1 - mean(VircoGenoCat.r != VircoGenoCat.c) / mean(is.na(VircoGenoCat.m))
# Distribution of column P71 in the complete, blanked and filled-in tables.
table(VircoGenoCat.c[["P71"]])
table(VircoGenoCat.m[["P71"]])
table(VircoGenoCat.r[["P71"]])
# Refit the forest on the filled-in predictors with the same seed as before.
set.seed(1980)
RF.r <- randomForest(x = VircoGenoCat.r, y = Trait.c, importance = TRUE)
RF.r
varImpPlot(RF.r)